In [24]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import random
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
In [25]:
# 10대 매출비율,20대 매출비율 평균영업개월수 점포수 총 매출액
data=pd.read_csv("plus_living.csv")
data["total_living_people"]=data["total_living_people"] /1000
data["idx"]=np.arange(31547)
del data['Unnamed: 0']
del data['cm_code_name']

st_data=pd.DataFrame(data,columns=["10's_sales_rate","20's_sales_rate","30's_sales_rate","40's_sales_rate","50's_sales_rate","60's_sales_rate","simillar_store_number"])

# standardScaler 함수 사용하여 data 스케일링, 에러는 한글때문임, 한글 외에 모든 데이터 스케일링 됌
def standardScaler(header_list):
    over_array = st_data[header_list]
    a = pd.DataFrame(over_array)
    a["idx"] = np.arange(31547)
    train_data = a.values
    
    standardScaler = StandardScaler()
    print(standardScaler.fit(train_data))
    train_data_standardScaled = standardScaler.transform(train_data)
    st_df = pd.DataFrame(train_data_standardScaled, columns=['standard_over','B'])
    del st_df['B']
    st_data[header_list] = st_df
    
# columns들중에서 선별하여 스케일링 수행
header = list(data)
header
for a in header:
    if(a == "10's_sales_rate" or  a == "20's_sales_rate" or a == "30's_sales_rate" or  a == "40's_sales_rate" or a == "50's_sales_rate" or  a == "60's_sales_rate" or a == "simillar_store_number"):
        standardScaler("{}".format(a))

#clustering
test_kmeans=pd.DataFrame(st_data,columns=["10's_sales_rate","20's_sales_rate","30's_sales_rate","40's_sales_rate","50's_sales_rate","60's_sales_rate","simillar_store_number"])
data_points=test_kmeans.values
kmeans=KMeans(n_clusters=5).fit(data_points)
test_kmeans['cluster_id']=kmeans.labels_
df = test_kmeans.copy()
test_kmeans["idx"] = np.arange(31547)

test_kmeans=pd.DataFrame(test_kmeans,columns=["cluster_id","idx"])
new=pd.merge(data,test_kmeans,on=["idx"])
del new["idx"]
test0 = rateof_1020_business_stnum0=new[new['cluster_id']==0]
test1 = rateof_1020_business_stnum1=new[new['cluster_id']==1]
test2 = rateof_1020_business_stnum2=new[new['cluster_id']==2]
test3 = rateof_1020_business_stnum3=new[new['cluster_id']==3]
test4 = rateof_1020_business_stnum4=new[new['cluster_id']==4]
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
In [26]:
Mean = pd.DataFrame({'test0' : test0.mean(), 'test1' : test1.mean(), 'test2' : test2.mean(), 'test3' : test3.mean(), 'test4' : test4.mean()})

Meandf = Mean.T
A = pd.DataFrame(Meandf , columns = ["act_jipyo_value", 'growth_jipyo_value', 'safety_jipyo_value'])
A = A.T
Mean
Out[26]:
test0 test1 test2 test3 test4
dates 2.018100e+05 2.018100e+05 2.018100e+05 2.018100e+05 2.018100e+05
cm_code 8.853832e+02 8.844689e+02 9.540104e+02 8.953502e+02 8.984819e+02
service_code 2.083884e+05 2.209239e+05 2.016142e+05 1.751763e+05 2.042190e+05
over_jisu_value 4.925953e+01 4.881437e+01 4.852918e+01 4.883561e+01 4.904102e+01
act_jipyo_value 5.101726e+01 5.102993e+01 5.238094e+01 5.171305e+01 5.031214e+01
growth_jipyo_value 5.415332e+01 5.244782e+01 5.660254e+01 5.272788e+01 5.283213e+01
safety_jipyo_value 5.511827e+01 5.508823e+01 5.429945e+01 5.412280e+01 5.506596e+01
business_month_avg 6.467960e+01 9.682911e+01 5.290888e+01 5.752110e+01 6.260569e+01
simillar_store_number 3.606083e+00 2.998210e+00 2.632672e+00 3.249488e+00 2.354731e+00
total_moving_people 4.992590e+04 5.178091e+04 5.981758e+04 5.709191e+04 5.158410e+04
10's_moving_people 2.978667e+03 2.889444e+03 4.850263e+03 3.207294e+03 2.751469e+03
20's_moving_people 1.218954e+04 1.220398e+04 1.916427e+04 1.649775e+04 1.311604e+04
30's_moving_people 1.017153e+04 1.030059e+04 1.068807e+04 1.166717e+04 1.081712e+04
40's_moving_people 8.617181e+03 8.916210e+03 8.908511e+03 9.157886e+03 8.782874e+03
50's_moving_people 8.546965e+03 9.315735e+03 8.734556e+03 8.925229e+03 8.668763e+03
60's_moving_people 7.422064e+03 8.154996e+03 7.471942e+03 7.636634e+03 7.447897e+03
c_month_sales_amount 3.219436e+07 6.769874e+07 2.870573e+07 6.178005e+07 2.305094e+07
10's_sales_rate 3.189169e-01 5.529788e-01 1.923211e+01 2.157160e+00 1.370154e-01
20's_sales_rate 2.223999e+00 7.101159e+00 3.681662e+01 3.165609e+01 6.476657e+00
30's_sales_rate 1.032489e+01 1.316852e+01 1.185722e+01 2.482802e+01 6.171897e+01
40's_sales_rate 7.286105e+01 2.112593e+01 1.573173e+01 2.044646e+01 2.229454e+01
50's_sales_rate 1.114841e+01 3.163107e+01 1.174457e+01 1.422327e+01 6.058024e+00
60's_sales_rate 3.122255e+00 2.642082e+01 4.617092e+00 6.689407e+00 3.315006e+00
10's_sales_amount 1.115809e+05 1.528737e+05 2.604664e+06 8.610713e+05 1.277253e+04
20's_sales_amount 4.426409e+05 3.145390e+06 1.081540e+07 1.430399e+07 1.271798e+06
30's_sales_amount 3.008583e+06 7.952733e+06 3.584682e+06 1.324802e+07 1.267317e+07
40's_sales_amount 2.085245e+07 1.374011e+07 4.387827e+06 1.096463e+07 5.721722e+06
50's_sales_amount 5.147903e+06 1.720797e+07 3.149394e+06 8.050490e+06 1.175040e+06
60's_sales_amount 8.922404e+05 1.766014e+07 1.244971e+06 4.292397e+06 6.977070e+05
store_number 2.676187e+00 2.853234e+00 2.231350e+00 3.368155e+00 1.836599e+00
man 2.515327e+04 2.616929e+04 3.001195e+04 2.879252e+04 2.605340e+04
woman 2.477260e+04 2.561161e+04 2.980560e+04 2.829939e+04 2.553069e+04
mon 7.095892e+03 7.340310e+03 8.667790e+03 8.114266e+03 7.313285e+03
tue 7.068577e+03 7.311293e+03 8.635997e+03 8.096746e+03 7.297935e+03
wed 7.126001e+03 7.369211e+03 8.637568e+03 8.163754e+03 7.359404e+03
thu 7.116959e+03 7.346801e+03 8.682696e+03 8.172524e+03 7.340215e+03
fri 7.236377e+03 7.489968e+03 8.607411e+03 8.302609e+03 7.482086e+03
sat 7.244277e+03 7.552728e+03 8.451462e+03 8.316105e+03 7.528140e+03
sun 7.037860e+03 7.370650e+03 8.134627e+03 7.925935e+03 7.263051e+03
weekend 1.428214e+04 1.492338e+04 1.658609e+04 1.624204e+04 1.479119e+04
total_living_people 3.093916e+01 3.010790e+01 2.836930e+01 2.821042e+01 2.930059e+01
10's_living_people 4.590787e+03 4.245847e+03 4.036725e+03 3.833295e+03 4.113516e+03
20's_living_people 4.584739e+03 4.481735e+03 4.562784e+03 4.514082e+03 4.440980e+03
30's_living_people 4.952064e+03 4.808060e+03 4.470381e+03 4.741270e+03 4.839553e+03
40's_living_people 5.027735e+03 4.783349e+03 4.522239e+03 4.421541e+03 4.665560e+03
50's_living_people 4.942997e+03 4.912836e+03 4.506154e+03 4.439763e+03 4.690371e+03
60's_living_people 6.840834e+03 6.876072e+03 6.271016e+03 6.260471e+03 6.550609e+03
cluster_id 0.000000e+00 1.000000e+00 2.000000e+00 3.000000e+00 4.000000e+00
In [27]:
A.plot(figsize=(12, 4), legend=True, fontsize=15)
Mean = pd.DataFrame({'test0' : test0.mean(), 'test1' : test1.mean(), 'test2' : test2.mean(), 'test3' : test3.mean(), 'test4' : test4.mean()})

Meandf = Mean.T
A2 = pd.DataFrame(Meandf , columns = ["10's_sales_rate", "20's_sales_rate", "30's_sales_rate","40's_sales_rate","50's_sales_rate","60's_sales_rate"])
A2 = A2.T
A2.plot(figsize=(12, 4), legend=True, fontsize=15)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x19c037ab860>
In [28]:
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)
code_name = pd.read_csv("code_name.csv")
test0 = pd.merge(test0 , code_name, on="service_code")
test1 = pd.merge(test1 , code_name, on="service_code")
test2 = pd.merge(test2 , code_name, on="service_code")
test3 = pd.merge(test3 , code_name, on="service_code")
test4 = pd.merge(test4 , code_name, on="service_code")
plt.rcParams.update({'font.size': 40})
fig = plt.figure(figsize=(50,50))
ax1 = fig.add_subplot(321)
df0 = test0.service_code_name.value_counts()
df0=df0.head(5)
df0.plot.pie(autopct='%.2f%%')
plt.title("service_code")
plt.axis('equal')

ax2 = fig.add_subplot(322)
df1 = test1.service_code_name.value_counts()
df1=df1.head(5)
df1.plot.pie(autopct='%.2f%%')
plt.title("service_code")
plt.axis('equal')

ax3 = fig.add_subplot(323)
df2 = test2.service_code_name.value_counts()
df2=df2.head(5)
df2.plot.pie(autopct='%.2f%%')
plt.title("service_code")
plt.axis('equal')

ax4 = fig.add_subplot(324)
df3 = test3.service_code_name.value_counts()
df3=df3.head(5)
df3.plot.pie(autopct='%.2f%%')
plt.title("service_code")
plt.axis('equal')

ax5 = fig.add_subplot(325)
df4 = test4.service_code_name.value_counts()
df4=df4.head(5)
df4.plot.pie(autopct='%.2f%%')
plt.title("service_code")
plt.axis('equal')
plt.show()
In [29]:
import plotly.express as px
px.box(new,x="cluster_id",y="10's_sales_rate",color='cluster_id',notched=True)
In [30]:
px.box(new,x="cluster_id",y="20's_sales_rate",color='cluster_id',notched=True)
In [31]:
px.box(new,x="cluster_id",y="30's_sales_rate",color='cluster_id',notched=True)
In [32]:
px.box(new,x="cluster_id",y="40's_sales_rate",color='cluster_id',notched=True)
In [33]:
px.box(new,x="cluster_id",y="50's_sales_rate",color='cluster_id',notched=True)
In [34]:
px.box(new,x="cluster_id",y="60's_sales_rate",color='cluster_id',notched=True)
In [35]:
px.box(new,x="cluster_id",y="simillar_store_number",color='cluster_id',notched=True)
In [36]:
from sklearn.decomposition import PCA
plt.rcParams['axes.unicode_minus'] = False
pca = PCA(n_components=3)
X_scaled = st_data
pca.fit(X_scaled)
plt.rcParams.update({'font.size': 10})
# 처음 두 개의 주성분을 사용해 데이터를 변환합니다
X_pca = pca.transform(X_scaled)
X_pca = pd.DataFrame(X_pca)

print("원본 데이터 형태: {}".format(str(X_scaled.shape)))
print("축소된 데이터 형태: {}".format(str(X_pca.shape)))
clusterid=pd.DataFrame(new,columns=["cluster_id"])
X_pca['cluster_id'] = clusterid

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot( projection='3d')

x = X_pca[0]
y = X_pca[1]
z = X_pca[2]

ax.scatter(x,y,z, c=X_pca['cluster_id'], marker='o')

ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_zlabel('z axis')

plt.show()
원본 데이터 형태: (31547, 7)
축소된 데이터 형태: (31547, 3)
In [4]:
import pandas_profiling as pp
pp.ProfileReport(test0)
Out[4]:

In [5]:
pp.ProfileReport(test1)
Out[5]:

In [6]:
pp.ProfileReport(test2)
Out[6]:

In [7]:
pp.ProfileReport(test3)
Out[7]:

In [8]:
pp.ProfileReport(test4)
Out[8]:

In [37]:
df.head()
Out[37]:
10's_sales_rate 20's_sales_rate 30's_sales_rate 40's_sales_rate 50's_sales_rate 60's_sales_rate simillar_store_number cluster_id
0 -0.417851 -1.016262 -1.326808 -1.352863 0.713260 3.609736 -0.552477 1
1 1.415834 2.611103 -0.381812 -1.352863 -0.788093 -0.607859 0.500985 3
2 -0.211562 -0.176182 -0.259085 -0.234474 0.242879 0.582383 -0.552477 1
3 1.805492 -0.293015 0.501822 -0.132313 -0.382148 -0.148581 -0.552477 3
4 -0.096956 1.270313 0.348413 -0.438795 -0.697883 -0.588453 1.291082 3
In [38]:
X_train, X_test = train_test_split(df, test_size=0.3, random_state=0)
y_train = X_train['cluster_id']
X_train = X_train.drop(['cluster_id'],axis=1)
y_test = X_test['cluster_id']
X_test = X_test.drop(['cluster_id'],axis =1)
X_train.shape, X_test.shape
Out[38]:
((22082, 7), (9465, 7))
In [39]:
forest = RandomForestClassifier(n_estimators=600,max_depth = 12, random_state=42,criterion = 'entropy',max_features='auto')
forest.fit(X_train,y_train)
pred = forest.predict(X_test)
print("훈련 세트 정확도 : {:.3f}".format(forest.score(X_train,y_train)))
print("테스트 세트 정확도 : {:.3f}".format(forest.score(X_test,y_test)))
print("accuracy : ",accuracy_score(y_test,pred))
훈련 세트 정확도 : 1.000
테스트 세트 정확도 : 0.986
accuracy :  0.9856312731114633
In [40]:
estimator = forest.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = X_train.columns,
                class_names = 'cluster_id',
                rounded = True, proportion = False, 
                precision = 2, filled = True)

from IPython.display import Image
Image(filename = 'tree.jpeg')
Out[40]:
In [41]:
def plot_feature_importances_cancer(model):

    n_features = X_train.shape[1]

    plt.barh(range(n_features), model.feature_importances_, align='center')

    plt.yticks(np.arange(n_features), X_train.columns)

    plt.xlabel("attr importances")

    plt.ylabel("attr")

    plt.ylim(-1, n_features)
plt.rcParams.update({'font.size': 15})
plt.show()
plot_feature_importances_cancer(forest)
In [42]:
from mpl_toolkits.mplot3d import Axes3D
origin = X_test.copy()
origin['cluster_id'] = y_test
fig = plt.figure(figsize = (15,20))
###############################
ax = fig.add_subplot(321,projection = '3d')
ax.scatter(origin["20's_sales_rate"],
           origin["30's_sales_rate"],
           origin["40's_sales_rate"],c = origin['cluster_id'])
ax.set_xlabel("20's_sales_rate")
ax.set_ylabel("30's_sales_rate")
ax.set_zlabel("40's_sales_rate")
plt.title("original data of test data")

test = X_test.copy()
test['cluster_id'] = pred
ax = fig.add_subplot(322,projection = '3d')
ax.scatter(test["20's_sales_rate"],
           test["30's_sales_rate"],
           test["40's_sales_rate"],c = test['cluster_id'])
ax.set_xlabel("20's_sales_rate")
ax.set_ylabel("30's_sales_rate")
ax.set_zlabel("40's_sales_rate")
plt.title("predict data of test data")
##############################
ax = fig.add_subplot(323,projection = '3d')
ax.scatter(origin["50's_sales_rate"],
           origin["60's_sales_rate"],
           origin["simillar_store_number"],c = origin['cluster_id'])
ax.set_xlabel("50's_sales_rate")
ax.set_ylabel("60's_sales_rate")
ax.set_zlabel("simillar_store_number")
plt.title("original data of test data")

ax = fig.add_subplot(324,projection = '3d')
ax.scatter(test["50's_sales_rate"],
           test["60's_sales_rate"],
           test["simillar_store_number"],c = test['cluster_id'])
ax.set_xlabel("50's_sales_rate")
ax.set_ylabel("60's_sales_rate")
ax.set_zlabel("simillar_store_number")
plt.title("predict data of test data")
##############################
ax = fig.add_subplot(325,projection = '3d')
ax.scatter(origin["10's_sales_rate"],
           origin["50's_sales_rate"],
           origin["simillar_store_number"],c = origin['cluster_id'])
ax.set_xlabel("10's_sales_rate")
ax.set_ylabel("50's_sales_rate")
ax.set_zlabel("simillar_store_number")
plt.title("original data of test data")

ax = fig.add_subplot(326,projection = '3d')
ax.scatter(test["10's_sales_rate"],
           test["50's_sales_rate"],
           test["simillar_store_number"],c = test['cluster_id'])
ax.set_xlabel("10's_sales_rate")
ax.set_ylabel("50's_sales_rate")
ax.set_zlabel("simillar_store_number")
plt.title("predict data of test data")

plt.show()
In [43]:
import seaborn as sns
array = confusion_matrix(origin['cluster_id'], test['cluster_id'])
labels = ['0','1','2','3','4']
labels = np.asarray(labels)
sns.heatmap(array, annot=True, fmt='',cmap = 'BuPu')
from sklearn.metrics import classification_report
result = classification_report(origin['cluster_id'], test['cluster_id'], target_names=['class 0', 'class 1','class 2','class 3','class 4'])
print(result)
              precision    recall  f1-score   support

     class 0       0.98      0.98      0.98       798
     class 1       0.99      0.99      0.99      3463
     class 2       0.99      0.97      0.98       329
     class 3       0.99      0.99      0.99      4153
     class 4       0.99      0.98      0.99       722

    accuracy                           0.99      9465
   macro avg       0.99      0.98      0.98      9465
weighted avg       0.99      0.99      0.99      9465

In [ ]: